d <-read_csv(paste0(home, "/output/temp_data_for_fitting.csv"))#> Rows: 98616 Columns: 13#> ── Column specification ────────────────────────────────────────────────────────#> Delimiter: ","#> chr (4): area, source, db, id#> dbl (8): year, temp, yday, month, day, VkDag, stn_nr, section_nr#> date (1): date#> #> ℹ Use `spec()` to retrieve the full column specification for this data.#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.d <- d |>mutate(area =as.factor(area),source_f =as.factor(source),year_f =as.factor(year))#> mutate: converted 'area' from character to factor (0 new NA)#> new variable 'source_f' (factor) with 3 unique values and 0% NA#> new variable 'year_f' (factor) with 83 unique values and 0% NA# Keep track of the years for which we have cohorts for matching with cohort datagdat <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/perch-growth/master/data/for-analysis/dat.csv")#> Rows: 338460 Columns: 12#> ── Column specification ────────────────────────────────────────────────────────#> Delimiter: ","#> chr (6): age_ring, area, gear, ID, sex, keep#> dbl (6): length_mm, age_bc, age_catch, catch_year, cohort, final_length#> #> ℹ Use `spec()` to retrieve the full column specification for this data.#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.gdat$area_cohort_age <-as.factor(paste(gdat$area, gdat$cohort, gdat$age_bc))gdat <- gdat |>group_by(area_cohort_age) |>filter(n() >10)#> group_by: one grouping variable (area_cohort_age)#> filter (grouped): removed 5,190 rows (2%), 333,270 rows remaininggdat <- gdat |>filter(age_catch >3) |>group_by(area) |>summarise(min =min(cohort),max =max(cohort)) |>arrange(min)#> filter (grouped): removed 107,058 rows (32%), 226,212 rows remaining#> group_by: one grouping variable (area)#> summarise: now 12 rows and 3 columns, ungroupedd <-left_join(d, gdat, by ="area") |>mutate(area =as.factor(area),growth_dat =ifelse(year >= min & year <= max, "Y", "N"))#> left_join: added 2 columns (min, max)#> > rows only in x 0#> > rows only in y ( 0)#> > matched rows 98,616#> > ========#> > rows total 98,616#> mutate: converted 'area' from character to factor (0 new NA)#> new variable 'growth_dat' (character) with 2 unique values and 0% NA# Drop data in SI_HA and BT before onset of warming?d2 <- d |>mutate(discard ="N",discard =ifelse(area =="BT"& year <=1980, "Y", discard),discard =ifelse(area =="SI_HA"& year <=1972, "Y", discard)) |>filter(discard =="N")#> mutate: new variable 'discard' (character) with 2 unique values and 0% NA#> filter: removed 1,146 rows (1%), 97,470 rows remaining
Fit models
Source as independent or interactive effect
Code
m <-sdmTMB(temp ~ area*year_f + source_f +s(yday, by = area, bs ="cc"), data = d,family =student(df =5),spatial ="off",spatiotemporal ="off",knots =list(yday =c(0.5, 364.5)),control =sdmTMBcontrol(newton_loops =1))
# Make a new data frame and predict!nd <-data.frame(expand.grid(yday =seq(min(d$yday), max(d$yday), by =1),area =unique(d$area),source =unique(d$source_f),year =unique(d$year))) |>mutate(source_f =as.factor(source),year_f =as.factor(year)) |># Left join in growth data columnleft_join(gdat, by ="area") |>mutate(area =as.factor(area),growth_dat =ifelse(year >= min & year <= max, "Y", "N"))#> mutate: new variable 'source_f' (factor) with 3 unique values and 0% NA#> new variable 'year_f' (factor) with 83 unique values and 0% NA#> left_join: added 2 columns (min, max)#> > rows only in x 0#> > rows only in y ( 0)#> > matched rows 1,093,608#> > ===========#> > rows total 1,093,608#> mutate: converted 'area' from character to factor (0 new NA)#> new variable 'growth_dat' (character) with 2 unique values and 0% NA# Predictnd$pred <-predict(m, newdata = nd)$est
In order to have a lower temperature in before-nuclear times (without any data to inform that), we can use the nearby areas.. so FM informs BT prior to nuclear
Code
nd <- nd |>mutate(keep ="Y",keep =ifelse(area =="BT"& year <1980, "N", keep),keep =ifelse(area =="SI_HA"& year <1972, "N", keep)) |>filter(keep =="Y")#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA#> filter: removed 79,056 rows (7%), 1,014,552 rows remainingnd_sub <- nd |>mutate(keep ="N",keep =ifelse(area =="FM"& year <1980, "Y", keep), # use FM instead of BTkeep =ifelse(area =="SI_EK"& year <1972, "Y", keep)) |># use SI_EK instead of SI_HAfilter(keep =="Y")#> mutate: changed 935,496 values (92%) of 'keep' (0 new NA)#> filter: removed 935,496 rows (92%), 79,056 rows remaining# Now change the labels to BT and SI_EK...nd_sub <- nd_sub |>mutate(area =ifelse(area =="FM", "BT", "SI_HA"))#> mutate: converted 'area' from factor to character (0 new NA)# Bind rows and plot only the temperature series we will use for growth modellingnd <-bind_rows(nd, nd_sub) |>select(-keep)#> select: dropped one variable (keep)
Plot predictions
Code
# Trimmed plotnd |>filter(growth_dat =="Y") |>ggplot(aes(yday, y = year, fill = pred, color = pred)) +geom_raster() +facet_wrap(~area, ncol =3) +scale_fill_viridis(option ="magma") +scale_color_viridis(option ="magma") +labs(x ="Yearday", y ="Year", color ="Predicted SST (°C)", fill ="Predicted SST (°C)")#> filter: removed 648,918 rows (59%), 444,690 rows remaining
Code
# Full plotnd |>ggplot(aes(yday, y = year, fill = pred, color = pred)) +geom_raster() +facet_wrap(~area, ncol =3) +scale_fill_viridis(option ="magma") +scale_color_viridis(option ="magma") +labs(x ="Yearday", y ="Year", color ="Predicted SST (°C)", fill ="Predicted SST (°C)")
Detailed exploration of predictions
Code
# Loop trough all areas, plot temperature as a function of yday, color by data source, facet by yearfor(i inunique(nd$area)) { plotdat <- nd |>filter(area == i)print(ggplot(plotdat, aes(yday, pred, color = source)) +scale_color_brewer(palette ="Dark2") +facet_wrap(~year) +geom_point(data =filter(d, area == i & year >min(plotdat$year)), size =0.2,aes(yday, temp, color = source)) +geom_line(linewidth =0.3) +labs(title =paste("Area = ", i), color ="", linetype ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position =c(0.8, 0.08), legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +labs(x ="Day of the year", y ="Predicted SST (°C)") )ggsave(paste0(home, "/figures/supp/full_model/temp_pred_yday_area_", i, ".pdf" ), width =17, height =17, units ="cm")}#> filter: removed 1,002,474 rows (92%), 91,134 rows remaining#> filter: removed 95,167 rows (97%), 3,449 rows remaining#> filter: removed 1,002,474 rows (92%), 91,134 rows remaining#> filter: removed 89,164 rows (90%), 9,452 rows remaining
# Summarise datadsum <- d |>group_by(year, area, source) |>summarise(temp =mean(temp)) |>mutate(type ="data")#> group_by: 3 grouping variables (year, area, source)#> summarise: now 1,676 rows and 4 columns, 2 group variables remaining (year, area)#> mutate (grouped): new variable 'type' (character) with one unique value and 0% NA# Summarise predictions from full model and area-specific modelpreds <- nd |>filter(growth_dat =="Y"& source =="logger") |>group_by(area, year) |>summarise(temp =mean(pred)) |>mutate(model ="full model")#> filter: removed 945,378 rows (86%), 148,230 rows remaining#> group_by: 2 grouping variables (area, year)#> summarise: now 405 rows and 3 columns, one group variable remaining (area)#> mutate (grouped): new variable 'model' (character) with one unique value and 0% NApreds_area <- nd_area |>filter(growth_dat =="Y"& source =="logger") |>group_by(area, year) |>summarise(temp =mean(pred)) |>mutate(model ="area model")#> filter: removed 846,247 rows (86%), 141,536 rows remaining#> group_by: 2 grouping variables (area, year)#> summarise: now 393 rows and 3 columns, one group variable remaining (area)#> mutate (grouped): new variable 'model' (character) with one unique value and 0% NApreds_comb <-bind_rows(preds, preds_area)ggplot(preds_comb, aes(year, temp, color = source, linetype = model)) +geom_point(data = dsum, aes(year, temp, color = source), size =0.75, alpha =0.75, inherit.aes =FALSE) +scale_color_brewer(palette ="Accent") +geom_line(linewidth =0.5, color ="grey20") +facet_wrap(~area) +theme(legend.position ="bottom")
Make final plot using the area-specific model
Code
order <- preds_area |>group_by(area) |>summarise(mean_temp =mean(temp)) |>arrange(desc(mean_temp))#> group_by: one grouping variable (area)#> summarise: now 11 rows and 2 columns, ungroupedorder#> # A tibble: 11 × 2#> area mean_temp#> <chr> <dbl>#> 1 SI_HA 14.1 #> 2 BT 12.2 #> 3 TH 9.30#> 4 FM 8.93#> 5 JM 8.84#> 6 BS 8.55#> 7 SI_EK 8.27#> 8 FB 8.20#> 9 MU 6.96#> 10 RA 6.61#> 11 HO 6.60# Save plot order..topt <-11.8# Overall t_opt from 02-fit-vbge.qmd! Update if needed# Add latitudearea <-c("BS", "BT", "FB", "FM", "HO", "JM", "MU", "RA", "SI_EK", "SI_HA", "TH")nareas <-length(area)lat <-c(60, 60.4, 60.3, 60.5, 63.7, 58, 59, 65.9, 57.3, 57.4, 56.1)lon <-c(21.5, 18.1, 19.5, 18, 20.9, 16.8, 18.1, 22.3, 16.6, 16.7, 15.9)area_attr <-data.frame(cbind(area = area, lat = lat, lon = lon)) |>mutate_at(c("lat","lon"), as.numeric) |>arrange(desc(lat))#> mutate_at: converted 'lat' from character to double (0 new NA)#> converted 'lon' from character to double (0 new NA)ggplot(preds_area, aes(year, temp, color = temp)) +facet_wrap(~factor(area, levels = area_attr$area), ncol =3) +geom_hline(yintercept = topt, linewidth =0.3, linetype =2, color ="grey") +geom_line() +labs(x ="Year", y ="Model-predicted annual average temperature") +scale_color_viridis(option ="magma", name ="Area") +guides(color ="none")
Code
preds |>group_by(area) |>summarise(min =min(year),max =max(year)) |>arrange(min)#> group_by: one grouping variable (area)#> summarise: now 12 rows and 3 columns, ungrouped#> # A tibble: 12 × 3#> area min max#> <chr> <dbl> <dbl>#> 1 JM 1953 2016#> 2 BT 1963 2000#> 3 FM 1963 2000#> 4 SI_EK 1968 2015#> 5 SI_HA 1968 2014#> 6 FB 1969 2016#> 7 MU 1981 2000#> 8 HO 1982 2016#> 9 BS 1985 2002#> 10 RA 1985 2006#> 11 VN 1987 1998#> 12 TH 2000 2014ggsave(paste0(home, "/figures/annual_average_temperature.pdf"), width =17, height =17, units ="cm")
Code
# Save prediction dfwrite_csv(preds, paste0(home, "/output/gam_predicted_temps.csv"))
Code below is not evaluated!
Growing season? This might be different for different areas…
Code
# Find day of the year where temperature exceeds 10C by area across years# TODO: Also for area-specific predictions?gs_area <- nd |>group_by(area, yday) |>summarise(mean_pred =mean(pred)) |>ungroup() |>filter(mean_pred >10) |>group_by(area) |>summarise(gs_min =min(yday),gs_max =max(yday))#> group_by: 2 grouping variables (area, yday)#> summarise: now 4,392 rows and 3 columns, one group variable remaining (area)#> ungroup: no grouping variables#> filter: removed 2,556 rows (58%), 1,836 rows remaining#> group_by: one grouping variable (area)#> summarise: now 12 rows and 3 columns, ungroupednd <-left_join(nd, gs_area, by ="area")#> left_join: added 2 columns (gs_min, gs_max)#> > rows only in x 0#> > rows only in y ( 0)#> > matched rows 1,093,608#> > ===========#> > rows total 1,093,608gs_area$mean_pred <-10# Plot!nd |>#filter(growth_dat == "Y") |> group_by(area, yday) |>summarise(mean_pred =mean(pred)) |>ggplot(aes(yday, mean_pred)) +geom_line() +labs(x ="Yearday", y ="Year", color ="Predicted SST (°C)", fill ="Predicted SST (°C)") +facet_wrap(~area) +geom_point(data = gs_area, aes(gs_min, mean_pred), inherit.aes =FALSE, color ="tomato2") +geom_point(data = gs_area, aes(gs_max, mean_pred), inherit.aes =FALSE, color ="tomato2") +geom_hline(yintercept =10, linetype =2)#> group_by: 2 grouping variables (area, yday)#> summarise: now 4,392 rows and 3 columns, one group variable remaining (area)
Now see if there is a systematic pattern in the difference between predicted and observed logger data, which could indicate that the source effect isn’t global but area-specific.
Code
dlog <- d |>filter(source =="logger") |>mutate(type ="data",id =paste(area, year, yday, sep ="_")) |>select(id, temp) |>group_by(id) |>summarise(obs =mean(temp)) # sometimes we have more than 1 observation per id#> filter: removed 34,632 rows (35%), 63,984 rows remaining#> mutate: changed 63,984 values (100%) of 'id' (63984 fewer NA)#> new variable 'type' (character) with one unique value and 0% NA#> select: dropped 17 variables (year, area, yday, month, date, …)#> group_by: one grouping variable (id)#> summarise: now 60,967 rows and 2 columns, ungrouped# dlog |> # group_by(id) |> # summarise(n = n()) |> # distinct(n)preds_log <- nd |>filter(growth_dat =="Y"& source =="logger") |>mutate(type ="model",id =paste(area, year, yday, sep ="_")) |>filter(id %in%unique(dlog$id)) |>ungroup() |>left_join(dlog, by ="id")#> filter: removed 945,378 rows (86%), 148,230 rows remaining#> mutate: new variable 'type' (character) with one unique value and 0% NA#> new variable 'id' (character) with 148,230 unique values and 0% NA#> filter: removed 107,373 rows (72%), 40,857 rows remaining#> ungroup: no grouping variables#> left_join: added one column (obs)#> > rows only in x 0#> > rows only in y (20,110)#> > matched rows 40,857#> > ========#> > rows total 40,857p1 <- preds_log |>mutate(resid = pred - obs) |>ggplot(aes(as.factor(area), resid, group =as.factor(area))) +#geom_jitter(alpha = 0.05, color = "grey20", height = 0, width = 0.2) + geom_violin(fill ="grey70", color =NA) +geom_boxplot(width =0.2, outlier.colour =NA, outlier.color =NA, outlier.fill =NA) +guides(color ="none") +geom_hline(yintercept =0, linetype =2, color ="tomato3", linewidth =0.75) +labs(x ="Area", y ="Manual residuals")#> mutate: new variable 'resid' (double) with 40,857 unique values and 0% NAp1
Code
p1 +facet_wrap(~year)
Some extra plots for BT especially, because it seems that the offset for logger isn’t that big in years without any logger data.. First predict BT only but with all data sources. Note this code is not run anymore and is instead expanded above to work on all areas
Code
# Predictndbt <-data.frame(expand.grid(yday =seq(min(d$yday), max(d$yday), by =1),year =seq(1980, #filter(gdat, area == "BT")$min, 2004), #filter(gdat, area == "BT")$min),source =unique(d$source))) |>mutate(area ="BT") |>mutate(area =as.factor(area),source_f =as.factor(source),year_f =as.factor(year)) ndbt$pred <-predict(m, newdata = ndbt)$est# Plotggplot(ndbt, aes(yday, pred, color = source)) +scale_color_brewer(palette ="Accent") +facet_wrap(~year) +geom_point(data =filter(d, area =="BT"& year >=min(ndbt$year) & year <=max(ndbt$year)), size =0.2,aes(yday, temp)) +geom_line() +labs(title ="Area = BT", color ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position ="bottom", legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +labs(x ="Day of the year", y ="Predicted SST (°C)")ggsave(paste0(home, "/figures/supp/BT_test_plots.pdf" ), width =17, height =17, units ="cm")# Right, so there is an offset... but it's not big. And the cold temperatures in# certain years in the 1980's is not due to the offset not working, just that the year# effect in that year is only informed by cold temperatures and there's no "memory"... # Though it seems also that there could be a bigger offset.. perhaps this indicates# the source should indeed vary by area? Maybe, we if we fit models by area separately instead? Then we don't# need the area interaction, and we can instead use a random walk or AR1 structure one# the year effect?dbt <- d |>filter(area =="BT")mbt <-sdmTMB(temp ~0+ source_f + year_f +s(yday, bs ="cc"), data = dbt,family =student(df =5),spatial ="off",spatiotemporal ="off",knots =list(yday =c(0.5, 364.5)),control =sdmTMBcontrol(newton_loops =1))sanity(mbt)summary(mbt)# Predictndbt2 <-data.frame(expand.grid(yday =seq(min(d$yday), max(d$yday), by =1),year =seq(1980, #filter(gdat, area == "BT")$min, 2004), #filter(gdat, area == "BT")$min),source =unique(d$source))) |>mutate(source_f =as.factor(source),year_f =as.factor(year)) ndbt2$pred <-predict(mbt, newdata = ndbt2)$est# Plot without dataggplot(filter(d, area =="BT"& year >=min(ndbt$year) & year <=max(ndbt$year)),aes(yday, temp, color = source)) +geom_line(data = ndbt, aes(yday, pred, color = source, linetype ="main model"), alpha =0.2) +geom_line(data = ndbt2, aes(yday, pred, color = source, linetype ="BT specific model"), alpha =0.2) +scale_color_brewer(palette ="Dark2") +facet_wrap(~year) +labs(title ="Area = BT", color ="", linetype ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position ="bottom", legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +labs(x ="Day of the year", y ="Predicted SST (°C)")ggsave(paste0(home, "/figures/supp/BT_test_plots2.pdf" ), width =17, height =17, units ="cm")# Plot with dataggplot(filter(d, area =="BT"& year >=min(ndbt$year) & year <=max(ndbt$year)),aes(yday, temp, color = source)) +geom_point(size =0.001, alpha =0.3) +geom_line(data = ndbt, aes(yday, pred, color = source, linetype ="main model"), alpha =0.7) +geom_line(data = ndbt2, aes(yday, pred, color = source, linetype ="BT specific model"), alpha =0.7) +scale_color_brewer(palette ="Dark2") +facet_wrap(~year) +labs(title ="Area = BT", color ="", linetype ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position ="bottom", legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +labs(x ="Day of the year", y ="Predicted SST (°C)")ggsave(paste0(home, "/figures/supp/BT_test_plots3.pdf" ), width =17, height =17, units ="cm")# Plot only new predictions, color by year facet by source?p1 <-ggplot() +geom_line(data = ndbt2, aes(yday, pred, color = year, group = year), alpha =0.7, linewidth =0.2) +scale_color_viridis() +facet_wrap(~source) +labs(title ="Area = BT", color ="", linetype ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position ="bottom", legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +coord_cartesian(ylim =c(0, 25)) +labs(x ="Day of the year", y ="Predicted SST (°C)", title ="BT specific model")p2 <-ggplot() +geom_line(data = ndbt, aes(yday, pred, color = year, group = year), alpha =0.7, linewidth =0.2) +scale_color_viridis() +facet_wrap(~source) +labs(title ="Area = BT", color ="", linetype ="") +guides(color =guide_legend(title.position ="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position ="bottom", legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +coord_cartesian(ylim =c(0, 25)) +labs(x ="Day of the year", y ="Predicted SST (°C)", title ="main model")p1 / p2ggsave(paste0(home, "/figures/supp/BT_test_plots4.pdf" ), width =17, height =17, units ="cm")# Now plot the annual means. Focus on 1980-2004, so that we don't need to worry about# the Forsmark predictions in years before heatingndbt_sum <- ndbt |>filter(source =="logger") |>group_by(year) |>summarise(mean_pred =mean(pred)) |>mutate(model ="main model")ndbt2_sum <- ndbt2 |>filter(source =="logger") |>group_by(year) |>summarise(mean_pred =mean(pred)) |>mutate(model ="BT-specific model")bt_sums <-bind_rows(ndbt_sum, ndbt2_sum)ggplot(bt_sums, aes(year, mean_pred, color = model)) +geom_line() +labs(x ="Year", y ="Model-predicted annual average temperature") +scale_color_brewer(palette ="Accent")ggsave(paste0(home, "/figures/supp/BT_test_plots5.pdf" ), width =17, height =17, units ="cm")
If we want to get uncertainty, we can use nsim instead; this simulates from the linear predictor using the inverse precision matrix, which is a fast way to get a distribution of samples from which we can take e.g. quantiles and means. However, it’s still slow, so the code below isn’t executed yet.
Code
nd_sim <-data.frame(expand.grid(yday =seq(min(d$yday), max(d$yday), by =1),area =unique(d$area),year =unique(d$year))) |>mutate(source ="logger") |>mutate(id =paste(year, area, sep ="_"),source_f =as.factor(source),year_f =as.factor(year))# Trim!nd_sim <-left_join(nd_sim, gdat, by ="area")nd_sim <- nd_sim |>mutate(growth_dat =ifelse(year > min, "Y", "N")) |>filter(growth_dat =="Y") |>filter(yday %in%c(gs_min:gs_min)) |>mutate(area =as.factor(area))# Predict!nsim <-500m_pred_sims <-predict(m, newdata = nd_sim, nsim = nsim)# Join sims with prediction datand_sim_long <-cbind(nd_sim, as.data.frame(m_pred_sims)) |>pivot_longer(c((ncol(nd_sim) +1):(nsim +ncol(nd_sim))))# Summarize sims over growing seasonsum_pred_gs <- nd_sim_long |>ungroup() |>group_by(year, area) |>summarise(lwr =quantile(value, prob =0.1),est =quantile(value, prob =0.5),upr =quantile(value, prob =0.9)) |>ungroup()# In order to have a lower temperature in before-nuclear times (without any data to inform that), we can use the nearby areas..sum_pred_gs <- preds |>mutate(keep ="Y",keep =ifelse(area =="BT"& year <1980, "N", keep),keep =ifelse(area =="SI_HA"& year <1972, "N", keep)) |>filter(keep =="Y")sum_pred_gs_sub <- preds |>mutate(keep ="N",keep =ifelse(area =="FM"& year <1980, "Y", keep), # use FM instead of BTkeep =ifelse(area =="SI_EK"& year <1972, "Y", keep)) |># use SI_EK instead of SI_HAfilter(keep =="Y")# Now change the labels to BT and SI_EK...sum_pred_gs_sub <- sum_pred_gs_sub |>mutate(area =ifelse(area =="FM", "BT", "SI_HA"))# Bind rows and plot only the temperature series we will use for growth modellingsum_pred_gs <-bind_rows(sum_pred_gs, sum_pred_gs_sub) |>select(-keep, -type)order <- sum_pred_gs |>group_by(area) |>summarise(mean_temp =mean(temp)) |>arrange(desc(mean_temp))